Group Members: Reid Brown, Cory Carr, Tappy Li, Kesia Ohene-Agyeman
library(leaps)
library(corrplot)
library(readr)
library(mosaic)
library(car)
library(dplyr)
library(Stat2Data)
# Load the Ames training data and the two class helper scripts.
# NOTE(review): these are absolute paths on one machine; adjust them before
# running anywhere else.
AmesData <- read_csv("/Users/reidbrown/Documents/Senior/Spring 2020/STOR 455/Class Example Data/AmesTrain12.csv")
source("/Users/reidbrown/Documents/Senior/Spring 2020/STOR 455/Class Scripts/ShowSubsets.R")
source("/Users/reidbrown/Documents/Senior/Spring 2020/STOR 455/Class Scripts/anova455.R")

# Preview the raw data.
head(AmesData)

# Keep only the numeric columns, then keep numeric columns 2 through 27,
# which drops the first one ("Order").
is_numeric_col <- vapply(AmesData, is.numeric, logical(1))
MyAmesData <- AmesData %>% select(which(is_numeric_col))
MyAmesData <- MyAmesData[, 2:27]
print(MyAmesData)
Part 1
# Backward selection: start from the model containing every column of
# MyAmesData as a predictor and let step() remove terms.
# The full model's residual variance is passed as `scale`, so every
# candidate model is scored against the same error estimate.
Full <- lm(Price ~ ., data = MyAmesData)
MSE <- summary(Full)$sigma^2
BackwardMod1 <- step(Full, scale = MSE, direction = "backward", trace = FALSE)
print(BackwardMod1)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementSF + FirstSF +
## SecondSF + Bedroom + TotalRooms + Fireplaces + GarageSF +
## OpenPorchSF + EnclosedPorchSF + ScreenPorchSF, data = MyAmesData)
##
## Coefficients:
## (Intercept) LotFrontage LotArea Quality
## -1.405e+03 1.452e-01 5.042e-04 1.654e+01
## Condition YearBuilt YearRemodel BasementFinSF
## 6.716e+00 5.232e-01 1.361e-01 2.237e-02
## BasementSF FirstSF SecondSF Bedroom
## 1.608e-02 4.875e-02 4.212e-02 -8.235e+00
## TotalRooms Fireplaces GarageSF OpenPorchSF
## 5.241e+00 4.642e+00 3.809e-02 4.126e-02
## EnclosedPorchSF ScreenPorchSF
## 3.773e-02 3.854e-02
# Forward selection: start from the intercept-only model and add terms,
# with the full model as the upper bound of the search.
none <- lm(Price ~ 1, data = MyAmesData)
ForwardMod1 <- step(
  none,
  scope = list(upper = Full),
  scale = MSE,
  direction = "forward",
  trace = FALSE
)
print(ForwardMod1)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## GarageSF + LotArea + Condition + BasementSF + LotFrontage +
## Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
## ScreenPorchSF + EnclosedPorchSF, data = MyAmesData)
##
## Coefficients:
## (Intercept) Quality GroundSF BasementFinSF
## -1.432e+03 1.622e+01 4.288e-02 2.260e-02
## YearBuilt GarageSF LotArea Condition
## 5.315e-01 4.098e-02 5.204e-04 6.690e+00
## BasementSF LotFrontage Bedroom TotalRooms
## 1.956e-02 1.472e-01 -8.206e+00 5.043e+00
## Fireplaces OpenPorchSF YearRemodel ScreenPorchSF
## 5.318e+00 4.179e-02 1.437e-01 3.955e-02
## EnclosedPorchSF
## 3.920e-02
# Stepwise selection: same start and upper bound as forward selection, but
# terms may be added or dropped at each step ("both" is step()'s default
# direction; it is spelled out here for clarity).
StepwiseMod1 <- step(
  none,
  scope = list(upper = Full),
  scale = MSE,
  direction = "both",
  trace = FALSE
)
print(StepwiseMod1)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## GarageSF + LotArea + Condition + BasementSF + LotFrontage +
## Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
## ScreenPorchSF + EnclosedPorchSF, data = MyAmesData)
##
## Coefficients:
## (Intercept) Quality GroundSF BasementFinSF
## -1.432e+03 1.622e+01 4.288e-02 2.260e-02
## YearBuilt GarageSF LotArea Condition
## 5.315e-01 4.098e-02 5.204e-04 6.690e+00
## BasementSF LotFrontage Bedroom TotalRooms
## 1.956e-02 1.472e-01 -8.206e+00 5.043e+00
## Fireplaces OpenPorchSF YearRemodel ScreenPorchSF
## 5.318e+00 4.179e-02 1.437e-01 3.955e-02
## EnclosedPorchSF
## 3.920e-02
# Summaries of each model: coefficient table, residual standard error,
# R-squared / adjusted R-squared and overall F-test, used below to compare
# the three selected models.
summary(BackwardMod1)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementSF + FirstSF +
## SecondSF + Bedroom + TotalRooms + Fireplaces + GarageSF +
## OpenPorchSF + EnclosedPorchSF + ScreenPorchSF, data = MyAmesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153.278 -16.024 -2.495 11.940 138.810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.405e+03 1.463e+02 -9.602 < 2e-16 ***
## LotFrontage 1.452e-01 3.570e-02 4.066 5.44e-05 ***
## LotArea 5.042e-04 9.148e-05 5.512 5.33e-08 ***
## Quality 1.654e+01 1.392e+00 11.879 < 2e-16 ***
## Condition 6.716e+00 1.183e+00 5.677 2.16e-08 ***
## YearBuilt 5.232e-01 6.543e-02 7.996 6.95e-15 ***
## YearRemodel 1.361e-01 7.795e-02 1.746 0.081384 .
## BasementFinSF 2.237e-02 3.042e-03 7.353 6.60e-13 ***
## BasementSF 1.608e-02 5.108e-03 3.149 0.001722 **
## FirstSF 4.875e-02 6.376e-03 7.646 8.59e-14 ***
## SecondSF 4.212e-02 5.187e-03 8.120 2.79e-15 ***
## Bedroom -8.235e+00 2.068e+00 -3.982 7.70e-05 ***
## TotalRooms 5.241e+00 1.514e+00 3.461 0.000577 ***
## Fireplaces 4.642e+00 2.258e+00 2.056 0.040265 *
## GarageSF 3.809e-02 6.796e-03 5.604 3.24e-08 ***
## OpenPorchSF 4.126e-02 1.737e-02 2.376 0.017839 *
## EnclosedPorchSF 3.773e-02 1.989e-02 1.897 0.058326 .
## ScreenPorchSF 3.854e-02 1.962e-02 1.964 0.049964 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.28 on 582 degrees of freedom
## Multiple R-squared: 0.8737, Adjusted R-squared: 0.87
## F-statistic: 236.8 on 17 and 582 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the forward-selection model.
summary(ForwardMod1)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## GarageSF + LotArea + Condition + BasementSF + LotFrontage +
## Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
## ScreenPorchSF + EnclosedPorchSF, data = MyAmesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -151.163 -15.762 -2.585 11.901 137.973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.432e+03 1.465e+02 -9.777 < 2e-16 ***
## Quality 1.622e+01 1.382e+00 11.731 < 2e-16 ***
## GroundSF 4.288e-02 5.057e-03 8.481 < 2e-16 ***
## BasementFinSF 2.260e-02 3.052e-03 7.406 4.59e-13 ***
## YearBuilt 5.315e-01 6.589e-02 8.066 4.15e-15 ***
## GarageSF 4.098e-02 6.731e-03 6.089 2.06e-09 ***
## LotArea 5.204e-04 9.149e-05 5.688 2.04e-08 ***
## Condition 6.690e+00 1.188e+00 5.633 2.76e-08 ***
## BasementSF 1.956e-02 4.021e-03 4.864 1.49e-06 ***
## LotFrontage 1.472e-01 3.573e-02 4.120 4.33e-05 ***
## Bedroom -8.206e+00 2.065e+00 -3.975 7.93e-05 ***
## TotalRooms 5.043e+00 1.545e+00 3.265 0.00116 **
## Fireplaces 5.318e+00 2.235e+00 2.380 0.01763 *
## OpenPorchSF 4.179e-02 1.743e-02 2.397 0.01686 *
## YearRemodel 1.437e-01 7.812e-02 1.839 0.06640 .
## ScreenPorchSF 3.955e-02 1.968e-02 2.009 0.04497 *
## EnclosedPorchSF 3.920e-02 1.989e-02 1.970 0.04928 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.39 on 583 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.869
## F-statistic: 249.4 on 16 and 583 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the stepwise-selection model.
summary(StepwiseMod1)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## GarageSF + LotArea + Condition + BasementSF + LotFrontage +
## Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
## ScreenPorchSF + EnclosedPorchSF, data = MyAmesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -151.163 -15.762 -2.585 11.901 137.973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.432e+03 1.465e+02 -9.777 < 2e-16 ***
## Quality 1.622e+01 1.382e+00 11.731 < 2e-16 ***
## GroundSF 4.288e-02 5.057e-03 8.481 < 2e-16 ***
## BasementFinSF 2.260e-02 3.052e-03 7.406 4.59e-13 ***
## YearBuilt 5.315e-01 6.589e-02 8.066 4.15e-15 ***
## GarageSF 4.098e-02 6.731e-03 6.089 2.06e-09 ***
## LotArea 5.204e-04 9.149e-05 5.688 2.04e-08 ***
## Condition 6.690e+00 1.188e+00 5.633 2.76e-08 ***
## BasementSF 1.956e-02 4.021e-03 4.864 1.49e-06 ***
## LotFrontage 1.472e-01 3.573e-02 4.120 4.33e-05 ***
## Bedroom -8.206e+00 2.065e+00 -3.975 7.93e-05 ***
## TotalRooms 5.043e+00 1.545e+00 3.265 0.00116 **
## Fireplaces 5.318e+00 2.235e+00 2.380 0.01763 *
## OpenPorchSF 4.179e-02 1.743e-02 2.397 0.01686 *
## YearRemodel 1.437e-01 7.812e-02 1.839 0.06640 .
## ScreenPorchSF 3.955e-02 1.968e-02 2.009 0.04497 *
## EnclosedPorchSF 3.920e-02 1.989e-02 1.970 0.04928 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.39 on 583 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.869
## F-statistic: 249.4 on 16 and 583 DF, p-value: < 2.2e-16
# Variance inflation factors for the backward-selection model, to screen
# the retained predictors for multicollinearity.
vif(BackwardMod1)
## LotFrontage LotArea Quality Condition
## 1.083407 1.163735 3.093569 1.483131
## YearBuilt YearRemodel BasementFinSF BasementSF
## 3.367110 2.158414 1.415640 3.260448
## FirstSF SecondSF Bedroom TotalRooms
## 4.428293 4.115268 2.291218 4.321898
## Fireplaces GarageSF OpenPorchSF EnclosedPorchSF
## 1.544143 1.737130 1.171583 1.406599
## ScreenPorchSF
## 1.096568
# Variance inflation factors for the forward-selection model.
vif(ForwardMod1)
## Quality GroundSF BasementFinSF YearBuilt
## 3.027375 5.157769 1.414244 3.389425
## GarageSF LotArea Condition BasementSF
## 1.690700 1.155283 1.483580 2.005624
## LotFrontage Bedroom TotalRooms Fireplaces
## 1.076635 2.266101 4.463418 1.500369
## OpenPorchSF YearRemodel ScreenPorchSF EnclosedPorchSF
## 1.171602 2.151402 1.095027 1.396489
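# vif(StepwiseMod1) -- not run: StepwiseMod1 has the same formula as
# ForwardMod1, so its VIFs would repeat the output above.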
We conducted backward selection, forward selection, and stepwise selection on the quantitative predictor variables in the Ames12 dataset. Backward selection returned 17 predictors, while forward and stepwise selection each returned only 16; the models produced by forward and stepwise selection turned out to be exactly the same. In the backward model, the adjusted R^2 is 0.870, two predictors are not significant at the 0.05 level (YearRemodel and EnclosedPorchSF), and no predictor has a VIF above 5, although two are close to this cutoff (FirstSF = 4.428293 and SecondSF = 4.115268). In the forward-selection model, the adjusted R^2 is 0.869, only one predictor is not significant at the 0.05 level (YearRemodel), and one predictor has a VIF above 5 (GroundSF). We think the forward-selection model is best because its adjusted R^2 is only 0.001 lower, while it has one fewer insignificant predictor.
Part 2
# Default lm diagnostic plots for the chosen (forward-selection) model.
plot(ForwardMod1)
# Histogram of the raw residuals.
hist(ForwardMod1$residuals)
# Size of the largest residual in absolute value.
max(abs(ForwardMod1$residuals))
## [1] 151.1633
# Which observation has the largest absolute residual.
which.max(abs(ForwardMod1$residuals))
## 320
## 320
#max(abs(ForwardMod1$residuals))
# Standardized residual of observation 320, the case which.max() flagged
# above as having the largest absolute raw residual.
rstandard(ForwardMod1)[320]
## 320
## -5.682328
# Studentized residual of the same observation (320).
rstudent(ForwardMod1)[320]
## 320
## -5.84152
# Studentized residuals against fitted values, with a horizontal reference
# line at zero.
plot(rstudent(ForwardMod1) ~ ForwardMod1$fitted.values, data = MyAmesData)
abline(h = 0)

# Same plot using standardized residuals.
plot(rstandard(ForwardMod1) ~ ForwardMod1$fitted.values, data = MyAmesData)
abline(h = 0)

# Re-display the model summary next to the diagnostics.
summary(ForwardMod1)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## GarageSF + LotArea + Condition + BasementSF + LotFrontage +
## Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
## ScreenPorchSF + EnclosedPorchSF, data = MyAmesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -151.163 -15.762 -2.585 11.901 137.973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.432e+03 1.465e+02 -9.777 < 2e-16 ***
## Quality 1.622e+01 1.382e+00 11.731 < 2e-16 ***
## GroundSF 4.288e-02 5.057e-03 8.481 < 2e-16 ***
## BasementFinSF 2.260e-02 3.052e-03 7.406 4.59e-13 ***
## YearBuilt 5.315e-01 6.589e-02 8.066 4.15e-15 ***
## GarageSF 4.098e-02 6.731e-03 6.089 2.06e-09 ***
## LotArea 5.204e-04 9.149e-05 5.688 2.04e-08 ***
## Condition 6.690e+00 1.188e+00 5.633 2.76e-08 ***
## BasementSF 1.956e-02 4.021e-03 4.864 1.49e-06 ***
## LotFrontage 1.472e-01 3.573e-02 4.120 4.33e-05 ***
## Bedroom -8.206e+00 2.065e+00 -3.975 7.93e-05 ***
## TotalRooms 5.043e+00 1.545e+00 3.265 0.00116 **
## Fireplaces 5.318e+00 2.235e+00 2.380 0.01763 *
## OpenPorchSF 4.179e-02 1.743e-02 2.397 0.01686 *
## YearRemodel 1.437e-01 7.812e-02 1.839 0.06640 .
## ScreenPorchSF 3.955e-02 1.968e-02 2.009 0.04497 *
## EnclosedPorchSF 3.920e-02 1.989e-02 1.970 0.04928 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.39 on 583 degrees of freedom
## Multiple R-squared: 0.8725, Adjusted R-squared: 0.869
## F-statistic: 249.4 on 16 and 583 DF, p-value: < 2.2e-16
# One scatterplot of Price against each predictor kept in ForwardMod1,
# used to judge linearity predictor by predictor.
forward_formula <- Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
  GarageSF + LotArea + Condition + BasementSF + LotFrontage +
  Bedroom + TotalRooms + Fireplaces + OpenPorchSF + YearRemodel +
  ScreenPorchSF + EnclosedPorchSF
plot(formula = forward_formula, data = MyAmesData)
Plotting Price against each of our predictors shows a fairly linear relationship for most predictors. In the cases where the relationship isn't linear, there is no better curve we could think of including. Linearity is satisfied.
After looking at the residuals vs leverage plot, none of the points appear unduly influential because none of them fall outside the Cook's distance boundaries. Because of this, we are not removing any data points. The raw, standardized, and studentized residual plots all show that the errors have zero mean because the residuals all appear to be centered around the reference line. The QQ plot follows the reference line well, which indicates normality. The residuals vs fitted plot demonstrates that constant variance is satisfied since there is no apparent fan shape. However, the residuals vs fitted plot shows that independence does not appear to be satisfied, as there is a fairly clear pattern (with superimposed curve) in the residuals. This will not be fixed by adding or removing variables; it will only be fixed by adding a transformation, so we are leaving it be for now and acknowledging that this is not an ideal linear model.
Part 3
# Build the expanded predictor set used for the second round of selection.
MyAmesData2 <- MyAmesData

# Combined screened + enclosed porch area.
MyAmesData2$PorchSF <- MyAmesData2$ScreenPorchSF + MyAmesData2$EnclosedPorchSF

# Append a log(x + 1) copy of every predictor, i.e. every column except the
# response (Price) in column 1. Columns are addressed by name and the ".1"
# suffix is assigned explicitly, so this does not depend on there being
# exactly 27 columns or on how the data-frame class names a column that is
# added by position. With the 27-column layout the new columns land in
# positions 28-53, in the same order as their originals.
# The "+ 1" keeps zero-valued predictors finite under the log.
predictor_names <- names(MyAmesData2)[-1]
for (predictor in predictor_names) {
  MyAmesData2[[paste0(predictor, ".1")]] <- log(MyAmesData2[[predictor]] + 1)
}

# 0/1 indicators for whether the house has each kind of porch at all.
MyAmesData2$ScreenPorchYN <- as.numeric(MyAmesData2$ScreenPorchSF != 0)
MyAmesData2$EnclosedPorchYN <- as.numeric(MyAmesData2$EnclosedPorchSF != 0)
MyAmesData2$PorchYN <- as.numeric(MyAmesData2$PorchSF != 0)
# Backward selection on the expanded data (original + log predictors +
# porch indicators). MSE is reassigned here, so the step() calls that
# follow are scaled by this full model's residual variance.
Full2 <- lm(Price ~ ., data = MyAmesData2)
MSE <- summary(Full2)$sigma^2
BackwardMod2 <- step(Full2, scale = MSE, direction = "backward", trace = FALSE)
print(BackwardMod2)
##
## Call:
## lm(formula = Price ~ LotArea + Quality + Condition + YearBuilt +
## YearRemodel + BasementFinSF + BasementUnFinSF + BasementSF +
## FirstSF + SecondSF + GroundSF + BasementFBath + HalfBath +
## Bedroom + TotalRooms + GarageCars + OpenPorchSF + ScreenPorchSF +
## LotFrontage.1 + LotArea.1 + Quality.1 + YearRemodel.1 + BasementFinSF.1 +
## GroundSF.1 + BasementFBath.1 + HalfBath.1 + Bedroom.1 + Fireplaces.1 +
## GarageCars.1 + GarageSF.1 + WoodDeckSF.1 + EnclosedPorchSF.1 +
## ScreenPorchSF.1 + ScreenPorchYN + EnclosedPorchYN, data = MyAmesData2)
##
## Coefficients:
## (Intercept) LotArea Quality
## 1.302e+06 1.829e-04 3.920e+01
## Condition YearBuilt YearRemodel
## 7.654e+00 5.061e-01 9.998e+01
## BasementFinSF BasementUnFinSF BasementSF
## 1.852e-02 -1.185e-02 3.338e-02
## FirstSF SecondSF GroundSF
## 4.340e-02 3.992e-02 5.063e-02
## BasementFBath HalfBath Bedroom
## 6.971e+01 -1.207e+02 -1.587e+01
## TotalRooms GarageCars OpenPorchSF
## 3.669e+00 5.673e+01 4.170e-02
## ScreenPorchSF LotFrontage.1 LotArea.1
## -1.940e-01 1.356e+00 9.032e+00
## Quality.1 YearRemodel.1 BasementFinSF.1
## -1.498e+02 -1.977e+05 -1.194e+00
## GroundSF.1 BasementFBath.1 HalfBath.1
## -8.371e+01 -9.977e+01 1.842e+02
## Bedroom.1 Fireplaces.1 GarageCars.1
## 4.612e+01 9.974e+00 -1.548e+02
## GarageSF.1 WoodDeckSF.1 EnclosedPorchSF.1
## 9.881e+00 7.201e-01 8.166e+00
## ScreenPorchSF.1 ScreenPorchYN EnclosedPorchYN
## 3.989e+01 -1.601e+02 -3.910e+01
# Forward selection on the expanded data, starting from the
# intercept-only model with Full2 as the upper bound.
none2 <- lm(Price ~ 1, data = MyAmesData2)
ForwardMod2 <- step(
  none2,
  scope = list(upper = Full2),
  scale = MSE,
  direction = "forward",
  trace = FALSE
)
print(ForwardMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + Quality.1 + Condition.1 + BasementSF + GroundSF.1 +
## GarageCars + PorchSF + YearBuilt.1 + Fireplaces + LotFrontage.1 +
## OpenPorchSF + LotArea + TotalRooms.1 + Condition + FullBath +
## BasementFinSF.1 + BasementUnFinSF + BasementUnFinSF.1 + BasementSF.1 +
## GarageSF + WoodDeckSF + GarageCars.1 + GarageSF.1 + Bedroom +
## Bedroom.1 + TotalRooms, data = MyAmesData2)
##
## Coefficients:
## (Intercept) Quality GroundSF
## 2.421e+05 3.946e+01 7.707e-02
## BasementFinSF YearBuilt LotArea.1
## 1.642e-02 1.955e+01 1.034e+01
## Quality.1 Condition.1 BasementSF
## -1.531e+02 -2.213e+01 4.130e-02
## GroundSF.1 GarageCars PorchSF
## -6.046e+01 5.709e+01 3.796e-02
## YearBuilt.1 Fireplaces LotFrontage.1
## -3.693e+04 6.974e+00 1.865e+00
## OpenPorchSF LotArea TotalRooms.1
## 4.818e-02 2.715e-04 -5.687e+01
## Condition FullBath BasementFinSF.1
## 1.236e+01 -5.869e+00 -1.528e+00
## BasementUnFinSF BasementUnFinSF.1 BasementSF.1
## -1.999e-02 1.965e+00 -3.380e+00
## GarageSF WoodDeckSF GarageCars.1
## -2.672e-03 8.927e-03 -1.552e+02
## GarageSF.1 Bedroom Bedroom.1
## 1.117e+01 -1.815e+01 5.088e+01
## TotalRooms
## 1.174e+01
# Stepwise selection on the expanded data (terms may enter or leave at
# each step; "both" is step()'s default direction).
StepwiseMod2 <- step(
  none2,
  scope = list(upper = Full2),
  scale = MSE,
  direction = "both",
  trace = FALSE
)
print(StepwiseMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + Quality.1 + BasementSF + GroundSF.1 + PorchSF +
## YearBuilt.1 + Fireplaces + LotFrontage.1 + OpenPorchSF +
## LotArea + TotalRooms.1 + Condition + FullBath + BasementFinSF.1 +
## BasementUnFinSF + BasementUnFinSF.1 + BasementSF.1 + GarageSF +
## WoodDeckSF + EnclosedPorchYN + EnclosedPorchSF.1 + ScreenPorchSF.1 +
## ScreenPorchYN, data = MyAmesData2)
##
## Coefficients:
## (Intercept) Quality GroundSF
## 3.040e+05 4.446e+01 9.782e-02
## BasementFinSF YearBuilt LotArea.1
## 1.427e-02 2.428e+01 9.086e+00
## Quality.1 BasementSF GroundSF.1
## -1.801e+02 4.317e-02 -9.261e+01
## PorchSF YearBuilt.1 Fireplaces
## -1.734e-01 -4.631e+04 7.130e+00
## LotFrontage.1 OpenPorchSF LotArea
## 2.201e+00 4.213e-02 2.826e-04
## TotalRooms.1 Condition FullBath
## 2.258e+01 8.800e+00 -7.515e+00
## BasementFinSF.1 BasementUnFinSF BasementUnFinSF.1
## -1.613e+00 -2.296e-02 2.665e+00
## BasementSF.1 GarageSF WoodDeckSF
## -3.522e+00 2.694e-02 1.422e-02
## EnclosedPorchYN EnclosedPorchSF.1 ScreenPorchSF.1
## -1.294e+02 3.252e+01 3.378e+01
## ScreenPorchYN
## -1.307e+02
# Coefficient table and fit statistics for the stepwise model on MyAmesData2.
summary(StepwiseMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + Quality.1 + BasementSF + GroundSF.1 + PorchSF +
## YearBuilt.1 + Fireplaces + LotFrontage.1 + OpenPorchSF +
## LotArea + TotalRooms.1 + Condition + FullBath + BasementFinSF.1 +
## BasementUnFinSF + BasementUnFinSF.1 + BasementSF.1 + GarageSF +
## WoodDeckSF + EnclosedPorchYN + EnclosedPorchSF.1 + ScreenPorchSF.1 +
## ScreenPorchYN, data = MyAmesData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -122.548 -13.934 -1.971 12.599 100.480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.040e+05 6.462e+04 4.705 3.19e-06 ***
## Quality 4.446e+01 4.673e+00 9.513 < 2e-16 ***
## GroundSF 9.782e-02 9.285e-03 10.535 < 2e-16 ***
## BasementFinSF 1.427e-02 7.374e-03 1.935 0.053424 .
## YearBuilt 2.428e+01 5.026e+00 4.831 1.75e-06 ***
## LotArea.1 9.086e+00 2.990e+00 3.038 0.002488 **
## Quality.1 -1.801e+02 2.968e+01 -6.068 2.36e-09 ***
## BasementSF 4.317e-02 8.264e-03 5.223 2.46e-07 ***
## GroundSF.1 -9.261e+01 1.489e+01 -6.221 9.53e-10 ***
## PorchSF -1.734e-01 6.985e-02 -2.483 0.013320 *
## YearBuilt.1 -4.631e+04 9.823e+03 -4.714 3.05e-06 ***
## Fireplaces 7.130e+00 2.039e+00 3.497 0.000506 ***
## LotFrontage.1 2.201e+00 6.499e-01 3.387 0.000755 ***
## OpenPorchSF 4.213e-02 1.567e-02 2.688 0.007395 **
## LotArea 2.826e-04 1.073e-04 2.635 0.008648 **
## TotalRooms.1 2.258e+01 9.425e+00 2.396 0.016901 *
## Condition 8.800e+00 1.038e+00 8.475 < 2e-16 ***
## FullBath -7.515e+00 2.893e+00 -2.597 0.009636 **
## BasementFinSF.1 -1.613e+00 7.214e-01 -2.235 0.025779 *
## BasementUnFinSF -2.296e-02 8.351e-03 -2.750 0.006153 **
## BasementUnFinSF.1 2.665e+00 9.546e-01 2.792 0.005416 **
## BasementSF.1 -3.522e+00 1.684e+00 -2.092 0.036883 *
## GarageSF 2.694e-02 6.250e-03 4.310 1.92e-05 ***
## WoodDeckSF 1.422e-02 8.330e-03 1.707 0.088375 .
## EnclosedPorchYN -1.294e+02 3.294e+01 -3.930 9.55e-05 ***
## EnclosedPorchSF.1 3.252e+01 8.635e+00 3.766 0.000183 ***
## ScreenPorchSF.1 3.378e+01 1.623e+01 2.081 0.037846 *
## ScreenPorchYN -1.307e+02 7.285e+01 -1.795 0.073236 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.37 on 572 degrees of freedom
## Multiple R-squared: 0.901, Adjusted R-squared: 0.8963
## F-statistic: 192.7 on 27 and 572 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the backward model on MyAmesData2.
summary(BackwardMod2)
##
## Call:
## lm(formula = Price ~ LotArea + Quality + Condition + YearBuilt +
## YearRemodel + BasementFinSF + BasementUnFinSF + BasementSF +
## FirstSF + SecondSF + GroundSF + BasementFBath + HalfBath +
## Bedroom + TotalRooms + GarageCars + OpenPorchSF + ScreenPorchSF +
## LotFrontage.1 + LotArea.1 + Quality.1 + YearRemodel.1 + BasementFinSF.1 +
## GroundSF.1 + BasementFBath.1 + HalfBath.1 + Bedroom.1 + Fireplaces.1 +
## GarageCars.1 + GarageSF.1 + WoodDeckSF.1 + EnclosedPorchSF.1 +
## ScreenPorchSF.1 + ScreenPorchYN + EnclosedPorchYN, data = MyAmesData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.38 -13.67 -1.77 12.62 96.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.302e+06 1.975e+05 6.596 9.76e-11 ***
## LotArea 1.829e-04 1.099e-04 1.665 0.096506 .
## Quality 3.920e+01 4.276e+00 9.166 < 2e-16 ***
## Condition 7.654e+00 1.040e+00 7.359 6.57e-13 ***
## YearBuilt 5.061e-01 6.006e-02 8.427 2.97e-16 ***
## YearRemodel 9.998e+01 1.513e+01 6.606 9.13e-11 ***
## BasementFinSF 1.852e-02 6.913e-03 2.679 0.007596 **
## BasementUnFinSF -1.185e-02 7.454e-03 -1.590 0.112324
## BasementSF 3.338e-02 8.069e-03 4.137 4.05e-05 ***
## FirstSF 4.340e-02 2.013e-02 2.157 0.031463 *
## SecondSF 3.992e-02 1.986e-02 2.010 0.044933 *
## GroundSF 5.063e-02 2.173e-02 2.330 0.020139 *
## BasementFBath 6.971e+01 2.528e+01 2.758 0.006013 **
## HalfBath -1.207e+02 2.567e+01 -4.703 3.23e-06 ***
## Bedroom -1.587e+01 6.221e+00 -2.551 0.011006 *
## TotalRooms 3.669e+00 1.330e+00 2.760 0.005976 **
## GarageCars 5.673e+01 1.090e+01 5.204 2.74e-07 ***
## OpenPorchSF 4.170e-02 1.486e-02 2.805 0.005198 **
## ScreenPorchSF -1.940e-01 1.191e-01 -1.629 0.103802
## LotFrontage.1 1.356e+00 6.243e-01 2.173 0.030230 *
## LotArea.1 9.032e+00 2.871e+00 3.146 0.001742 **
## Quality.1 -1.498e+02 2.675e+01 -5.603 3.30e-08 ***
## YearRemodel.1 -1.977e+05 2.996e+04 -6.600 9.52e-11 ***
## BasementFinSF.1 -1.194e+00 6.705e-01 -1.781 0.075394 .
## GroundSF.1 -8.371e+01 1.472e+01 -5.685 2.10e-08 ***
## BasementFBath.1 -9.977e+01 3.724e+01 -2.679 0.007595 **
## HalfBath.1 1.842e+02 3.826e+01 4.814 1.90e-06 ***
## Bedroom.1 4.612e+01 2.087e+01 2.209 0.027543 *
## Fireplaces.1 9.974e+00 3.107e+00 3.210 0.001403 **
## GarageCars.1 -1.548e+02 3.337e+01 -4.639 4.35e-06 ***
## GarageSF.1 9.881e+00 2.533e+00 3.900 0.000108 ***
## WoodDeckSF.1 7.201e-01 4.126e-01 1.745 0.081494 .
## EnclosedPorchSF.1 8.166e+00 3.544e+00 2.304 0.021581 *
## ScreenPorchSF.1 3.989e+01 2.512e+01 1.588 0.112876
## ScreenPorchYN -1.601e+02 1.087e+02 -1.473 0.141268
## EnclosedPorchYN -3.910e+01 1.747e+01 -2.239 0.025569 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.98 on 564 degrees of freedom
## Multiple R-squared: 0.9132, Adjusted R-squared: 0.9078
## F-statistic: 169.4 on 35 and 564 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the forward model on MyAmesData2.
summary(ForwardMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + Quality.1 + Condition.1 + BasementSF + GroundSF.1 +
## GarageCars + PorchSF + YearBuilt.1 + Fireplaces + LotFrontage.1 +
## OpenPorchSF + LotArea + TotalRooms.1 + Condition + FullBath +
## BasementFinSF.1 + BasementUnFinSF + BasementUnFinSF.1 + BasementSF.1 +
## GarageSF + WoodDeckSF + GarageCars.1 + GarageSF.1 + Bedroom +
## Bedroom.1 + TotalRooms, data = MyAmesData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.286 -12.787 -2.027 12.179 98.894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.421e+05 6.676e+04 3.626 0.000313 ***
## Quality 3.946e+01 4.755e+00 8.298 7.70e-16 ***
## GroundSF 7.707e-02 1.242e-02 6.208 1.03e-09 ***
## BasementFinSF 1.642e-02 7.395e-03 2.221 0.026758 *
## YearBuilt 1.955e+01 5.195e+00 3.763 0.000185 ***
## LotArea.1 1.034e+01 3.021e+00 3.423 0.000664 ***
## Quality.1 -1.531e+02 3.013e+01 -5.083 5.06e-07 ***
## Condition.1 -2.213e+01 3.489e+01 -0.634 0.526265
## BasementSF 4.130e-02 8.301e-03 4.976 8.62e-07 ***
## GroundSF.1 -6.046e+01 1.870e+01 -3.233 0.001295 **
## GarageCars 5.709e+01 1.480e+01 3.857 0.000128 ***
## PorchSF 3.796e-02 1.323e-02 2.869 0.004276 **
## YearBuilt.1 -3.693e+04 1.015e+04 -3.640 0.000298 ***
## Fireplaces 6.974e+00 2.067e+00 3.375 0.000789 ***
## LotFrontage.1 1.865e+00 6.592e-01 2.830 0.004821 **
## OpenPorchSF 4.818e-02 1.566e-02 3.077 0.002195 **
## LotArea 2.715e-04 1.069e-04 2.539 0.011374 *
## TotalRooms.1 -5.687e+01 4.804e+01 -1.184 0.236992
## Condition 1.236e+01 5.513e+00 2.243 0.025290 *
## FullBath -5.869e+00 3.035e+00 -1.933 0.053677 .
## BasementFinSF.1 -1.528e+00 7.217e-01 -2.117 0.034690 *
## BasementUnFinSF -1.999e-02 8.353e-03 -2.393 0.017031 *
## BasementUnFinSF.1 1.965e+00 9.706e-01 2.024 0.043406 *
## BasementSF.1 -3.380e+00 1.685e+00 -2.006 0.045305 *
## GarageSF -2.672e-03 1.290e-02 -0.207 0.835988
## WoodDeckSF 8.927e-03 8.337e-03 1.071 0.284693
## GarageCars.1 -1.552e+02 4.132e+01 -3.757 0.000190 ***
## GarageSF.1 1.117e+01 3.242e+00 3.444 0.000615 ***
## Bedroom -1.815e+01 6.646e+00 -2.730 0.006523 **
## Bedroom.1 5.088e+01 2.209e+01 2.304 0.021590 *
## TotalRooms 1.174e+01 6.524e+00 1.799 0.072535 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.28 on 569 degrees of freedom
## Multiple R-squared: 0.9022, Adjusted R-squared: 0.897
## F-statistic: 174.9 on 30 and 569 DF, p-value: < 2.2e-16
Next we check whether the regular or the log version of each predictor has the higher correlation with price. We only want to include either the original predictor or the log() version of that predictor, not both. The above models sometimes included both the original predictor and the log() version of that predictor. This is fixed below.
# Build MyAmesData3: Price plus, for each predictor, whichever version --
# original (columns 2-27 of MyAmesData2) or log(x + 1) (the matching column
# 26 places later) -- has the larger absolute correlation with Price.
linear_cols <- 2:27
log_cols <- linear_cols + 26

# One correlation per predictor, computed for all columns at once.
abs_cor_linear <- as.vector(abs(cor(MyAmesData2$Price, MyAmesData2[linear_cols])))
abs_cor_log <- as.vector(abs(cor(MyAmesData2$Price, MyAmesData2[log_cols])))

# An NA correlation (missing values or a constant column) cannot be
# compared; stop with a clear message instead of failing inside a condition.
stopifnot(!anyNA(abs_cor_linear), !anyNA(abs_cor_log))

# Ties go to the log version, matching a strict ">" comparison.
chosen_cols <- ifelse(abs_cor_linear > abs_cor_log, linear_cols, log_cols)

# Subsetting keeps each chosen column's existing name (e.g. "LotArea.1").
MyAmesData3 <- MyAmesData2[c(1, chosen_cols)]

# 0/1 indicators for whether the house has each kind of porch at all.
MyAmesData3$ScreenPorchYN <- as.numeric(MyAmesData2$ScreenPorchSF != 0)
MyAmesData3$EnclosedPorchYN <- as.numeric(MyAmesData2$EnclosedPorchSF != 0)
MyAmesData3$PorchYN <- as.numeric(MyAmesData2$PorchSF != 0)
Try selection methods with new MyAmesData3
# Backward selection on MyAmesData3 (one version of each predictor).
# Full2, MSE and BackwardMod2 are reused names: from here on they refer to
# the MyAmesData3 fits and replace the earlier MyAmesData2 ones.
Full2 <- lm(Price ~ ., data = MyAmesData3)
MSE <- summary(Full2)$sigma^2
BackwardMod2 <- step(Full2, scale = MSE, direction = "backward", trace = FALSE)
print(BackwardMod2)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea.1 + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementSF + FirstSF +
## SecondSF + Bedroom + TotalRooms + Fireplaces.1 + GarageSF +
## OpenPorchSF.1 + EnclosedPorchSF.1 + EnclosedPorchYN + PorchYN,
## data = MyAmesData3)
##
## Coefficients:
## (Intercept) LotFrontage LotArea.1
## -1.520e+03 1.137e-01 1.521e+01
## Quality Condition YearBuilt
## 1.655e+01 6.768e+00 5.268e-01
## YearRemodel BasementFinSF BasementSF
## 1.281e-01 2.211e-02 1.673e-02
## FirstSF SecondSF Bedroom
## 4.822e-02 4.393e-02 -9.503e+00
## TotalRooms Fireplaces.1 GarageSF
## 4.873e+00 6.775e+00 3.225e-02
## OpenPorchSF.1 EnclosedPorchSF.1 EnclosedPorchYN
## 9.655e-01 7.058e+00 -3.742e+01
## PorchYN
## 9.346e+00
# Forward selection on MyAmesData3; none2 and ForwardMod2 are reused names
# and now refer to the MyAmesData3 fits.
none2 <- lm(Price ~ 1, data = MyAmesData3)
ForwardMod2 <- step(
  none2,
  scope = list(upper = Full2),
  scale = MSE,
  direction = "forward",
  trace = FALSE
)
print(ForwardMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + GarageSF + Condition + BasementSF + Bedroom +
## LotFrontage + PorchSF.1 + TotalRooms + Fireplaces.1 + YearRemodel +
## OpenPorchSF.1, data = MyAmesData3)
##
## Coefficients:
## (Intercept) Quality GroundSF BasementFinSF YearBuilt
## -1.578e+03 1.619e+01 4.427e-02 2.258e-02 5.483e-01
## LotArea.1 GarageSF Condition BasementSF Bedroom
## 1.589e+01 3.485e-02 6.753e+00 1.900e-02 -9.262e+00
## LotFrontage PorchSF.1 TotalRooms Fireplaces.1 YearRemodel
## 1.153e-01 1.625e+00 4.508e+00 7.913e+00 1.345e-01
## OpenPorchSF.1
## 9.537e-01
# Stepwise selection on MyAmesData3 ("both" is step()'s default direction).
StepwiseMod2 <- step(
  none2,
  scope = list(upper = Full2),
  scale = MSE,
  direction = "both",
  trace = FALSE
)
print(StepwiseMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + GarageSF + Condition + BasementSF + Bedroom +
## LotFrontage + PorchSF.1 + TotalRooms + Fireplaces.1 + YearRemodel +
## OpenPorchSF.1, data = MyAmesData3)
##
## Coefficients:
## (Intercept) Quality GroundSF BasementFinSF YearBuilt
## -1.578e+03 1.619e+01 4.427e-02 2.258e-02 5.483e-01
## LotArea.1 GarageSF Condition BasementSF Bedroom
## 1.589e+01 3.485e-02 6.753e+00 1.900e-02 -9.262e+00
## LotFrontage PorchSF.1 TotalRooms Fireplaces.1 YearRemodel
## 1.153e-01 1.625e+00 4.508e+00 7.913e+00 1.345e-01
## OpenPorchSF.1
## 9.537e-01
# Coefficient table and fit statistics for the stepwise model on MyAmesData3.
summary(StepwiseMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + GarageSF + Condition + BasementSF + Bedroom +
## LotFrontage + PorchSF.1 + TotalRooms + Fireplaces.1 + YearRemodel +
## OpenPorchSF.1, data = MyAmesData3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -164.422 -15.679 -3.107 12.135 133.895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.578e+03 1.509e+02 -10.452 < 2e-16 ***
## Quality 1.619e+01 1.379e+00 11.738 < 2e-16 ***
## GroundSF 4.427e-02 5.013e-03 8.830 < 2e-16 ***
## BasementFinSF 2.258e-02 3.038e-03 7.430 3.88e-13 ***
## YearBuilt 5.483e-01 6.590e-02 8.321 6.20e-16 ***
## LotArea.1 1.589e+01 2.572e+00 6.177 1.22e-09 ***
## GarageSF 3.485e-02 6.808e-03 5.118 4.20e-07 ***
## Condition 6.753e+00 1.179e+00 5.729 1.62e-08 ***
## BasementSF 1.900e-02 4.009e-03 4.739 2.70e-06 ***
## Bedroom -9.262e+00 2.074e+00 -4.467 9.54e-06 ***
## LotFrontage 1.153e-01 3.616e-02 3.189 0.00150 **
## PorchSF.1 1.625e+00 5.858e-01 2.774 0.00572 **
## TotalRooms 4.508e+00 1.541e+00 2.926 0.00357 **
## Fireplaces.1 7.913e+00 3.540e+00 2.235 0.02577 *
## YearRemodel 1.345e-01 7.779e-02 1.729 0.08441 .
## OpenPorchSF.1 9.537e-01 6.121e-01 1.558 0.11975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.31 on 584 degrees of freedom
## Multiple R-squared: 0.873, Adjusted R-squared: 0.8698
## F-statistic: 267.7 on 15 and 584 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the backward model on MyAmesData3.
summary(BackwardMod2)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea.1 + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementSF + FirstSF +
## SecondSF + Bedroom + TotalRooms + Fireplaces.1 + GarageSF +
## OpenPorchSF.1 + EnclosedPorchSF.1 + EnclosedPorchYN + PorchYN,
## data = MyAmesData3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153.629 -15.617 -2.989 11.825 133.687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.520e+03 1.535e+02 -9.904 < 2e-16 ***
## LotFrontage 1.137e-01 3.616e-02 3.143 0.00176 **
## LotArea.1 1.521e+01 2.610e+00 5.827 9.35e-09 ***
## Quality 1.655e+01 1.403e+00 11.796 < 2e-16 ***
## Condition 6.768e+00 1.177e+00 5.748 1.46e-08 ***
## YearBuilt 5.268e-01 6.803e-02 7.744 4.32e-14 ***
## YearRemodel 1.281e-01 7.773e-02 1.648 0.09996 .
## BasementFinSF 2.211e-02 3.038e-03 7.278 1.11e-12 ***
## BasementSF 1.673e-02 5.077e-03 3.295 0.00104 **
## FirstSF 4.822e-02 6.354e-03 7.590 1.28e-13 ***
## SecondSF 4.393e-02 5.141e-03 8.545 < 2e-16 ***
## Bedroom -9.503e+00 2.084e+00 -4.560 6.23e-06 ***
## TotalRooms 4.873e+00 1.514e+00 3.219 0.00136 **
## Fireplaces.1 6.775e+00 3.577e+00 1.894 0.05868 .
## GarageSF 3.225e-02 6.853e-03 4.706 3.16e-06 ***
## OpenPorchSF.1 9.655e-01 6.121e-01 1.577 0.11526
## EnclosedPorchSF.1 7.058e+00 4.049e+00 1.743 0.08186 .
## EnclosedPorchYN -3.742e+01 2.041e+01 -1.834 0.06723 .
## PorchYN 9.346e+00 4.089e+00 2.286 0.02262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.21 on 581 degrees of freedom
## Multiple R-squared: 0.8746, Adjusted R-squared: 0.8707
## F-statistic: 225.1 on 18 and 581 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the forward model on MyAmesData3.
summary(ForwardMod2)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF + BasementFinSF + YearBuilt +
## LotArea.1 + GarageSF + Condition + BasementSF + Bedroom +
## LotFrontage + PorchSF.1 + TotalRooms + Fireplaces.1 + YearRemodel +
## OpenPorchSF.1, data = MyAmesData3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -164.422 -15.679 -3.107 12.135 133.895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.578e+03 1.509e+02 -10.452 < 2e-16 ***
## Quality 1.619e+01 1.379e+00 11.738 < 2e-16 ***
## GroundSF 4.427e-02 5.013e-03 8.830 < 2e-16 ***
## BasementFinSF 2.258e-02 3.038e-03 7.430 3.88e-13 ***
## YearBuilt 5.483e-01 6.590e-02 8.321 6.20e-16 ***
## LotArea.1 1.589e+01 2.572e+00 6.177 1.22e-09 ***
## GarageSF 3.485e-02 6.808e-03 5.118 4.20e-07 ***
## Condition 6.753e+00 1.179e+00 5.729 1.62e-08 ***
## BasementSF 1.900e-02 4.009e-03 4.739 2.70e-06 ***
## Bedroom -9.262e+00 2.074e+00 -4.467 9.54e-06 ***
## LotFrontage 1.153e-01 3.616e-02 3.189 0.00150 **
## PorchSF.1 1.625e+00 5.858e-01 2.774 0.00572 **
## TotalRooms 4.508e+00 1.541e+00 2.926 0.00357 **
## Fireplaces.1 7.913e+00 3.540e+00 2.235 0.02577 *
## YearRemodel 1.345e-01 7.779e-02 1.729 0.08441 .
## OpenPorchSF.1 9.537e-01 6.121e-01 1.558 0.11975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.31 on 584 degrees of freedom
## Multiple R-squared: 0.873, Adjusted R-squared: 0.8698
## F-statistic: 267.7 on 15 and 584 DF, p-value: < 2.2e-16
# Default lm diagnostic plots for the forward model fit on MyAmesData3, to
# re-check the regression conditions after transforming predictors.
plot(ForwardMod2)
Clearly, even with the transformations to the predictors, the independence-of-errors condition is still not satisfied. So we are going to take the log of Price and repeat the same steps.
# Build MyAmesData4: the response is log(Price), and for each predictor we
# keep whichever version -- original (columns 2-27 of MyAmesData2) or
# log(x + 1) (the matching column 26 places later) -- has the larger
# absolute correlation with log(Price).
linear_cols <- 2:27
log_cols <- linear_cols + 26
log_price <- log(MyAmesData2$Price)

# One correlation per predictor, computed for all columns at once.
abs_cor_linear <- as.vector(abs(cor(log_price, MyAmesData2[linear_cols])))
abs_cor_log <- as.vector(abs(cor(log_price, MyAmesData2[log_cols])))

# An NA correlation (missing values or a constant column) cannot be
# compared; stop with a clear message instead of failing inside a condition.
stopifnot(!anyNA(abs_cor_linear), !anyNA(abs_cor_log))

# Ties go to the log version, matching a strict ">" comparison.
chosen_cols <- ifelse(abs_cor_linear > abs_cor_log, linear_cols, log_cols)

# Subsetting keeps each chosen column's existing name (e.g. "LotArea.1").
MyAmesData4 <- MyAmesData2[c(1, chosen_cols)]

# NOTE: the column is still called "Price" but now holds log(Price), so
# every model fit on MyAmesData4 is on the log-price scale.
MyAmesData4$Price <- log_price

# 0/1 indicators for whether the house has each kind of porch at all.
MyAmesData4$ScreenPorchYN <- as.numeric(MyAmesData2$ScreenPorchSF != 0)
MyAmesData4$EnclosedPorchYN <- as.numeric(MyAmesData2$EnclosedPorchSF != 0)
MyAmesData4$PorchYN <- as.numeric(MyAmesData2$PorchSF != 0)
Backward:
# Full model: the (logged) Price regressed on every other column of MyAmesData4.
Full3 = lm(Price ~ ., data = MyAmesData4)
# Residual variance of the full model, passed to step() as `scale`.
MSE = summary(Full3)$sigma^2
# No scope is supplied, so step() searches backward from the full model.
BackMod3 = step(Full3, scale = MSE, direction = "backward", trace = FALSE)
Forward:
# Forward selection: start from the intercept-only model and add terms,
# never going beyond the predictors in Full3.
none3 = lm(Price ~ 1, data = MyAmesData4)
ForwardMod3 = step(
  none3,
  scope = list(upper = Full3),
  scale = MSE,
  direction = "forward",
  trace = FALSE
)
Stepwise Regression:
# Stepwise selection from the intercept-only model; with a scope and no
# explicit direction, step() searches in both directions.
StepMod3 = step(
  none3,
  scope = list(upper = Full3),
  scale = MSE,
  direction = "both",
  trace = FALSE
)
# Coefficient table and fit statistics for the backward-selection model.
summary(BackMod3)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea.1 + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementUnFinSF.1 +
## BasementSF + FirstSF.1 + SecondSF + GroundSF.1 + Bedroom +
## TotalRooms.1 + Fireplaces.1 + GarageSF + OpenPorchSF.1 +
## EnclosedPorchSF.1 + EnclosedPorchYN + PorchYN, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54687 -0.05956 0.00055 0.06623 0.40955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.134e+00 7.944e-01 -10.240 < 2e-16 ***
## LotFrontage 2.452e-04 1.716e-04 1.428 0.153702
## LotArea.1 7.638e-02 1.253e-02 6.095 2.00e-09 ***
## Quality 8.973e-02 6.811e-03 13.175 < 2e-16 ***
## Condition 5.693e-02 5.588e-03 10.188 < 2e-16 ***
## YearBuilt 3.544e-03 3.264e-04 10.856 < 2e-16 ***
## YearRemodel 8.624e-04 3.677e-04 2.346 0.019327 *
## BasementFinSF 9.409e-05 1.707e-05 5.512 5.35e-08 ***
## BasementUnFinSF.1 7.497e-03 4.219e-03 1.777 0.076073 .
## BasementSF 7.636e-05 2.986e-05 2.557 0.010812 *
## FirstSF.1 2.163e-01 6.080e-02 3.557 0.000405 ***
## SecondSF 1.391e-04 4.475e-05 3.108 0.001976 **
## GroundSF.1 1.393e-01 7.390e-02 1.884 0.060002 .
## Bedroom -2.013e-02 1.012e-02 -1.989 0.047160 *
## TotalRooms.1 9.804e-02 5.425e-02 1.807 0.071282 .
## Fireplaces.1 6.186e-02 1.705e-02 3.628 0.000311 ***
## GarageSF 1.290e-04 3.258e-05 3.960 8.42e-05 ***
## OpenPorchSF.1 5.618e-03 2.934e-03 1.915 0.056032 .
## EnclosedPorchSF.1 6.875e-02 1.922e-02 3.578 0.000376 ***
## EnclosedPorchYN -3.529e-01 9.676e-02 -3.647 0.000289 ***
## PorchYN 3.929e-02 1.941e-02 2.024 0.043378 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1289 on 579 degrees of freedom
## Multiple R-squared: 0.9029, Adjusted R-squared: 0.8995
## F-statistic: 269.1 on 20 and 579 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the forward-selection model.
summary(ForwardMod3)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 +
## Condition + BasementFinSF + BasementSF + GarageSF + Fireplaces.1 +
## YearRemodel + PorchSF.1 + PorchYN + OpenPorchSF.1, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54412 -0.06434 -0.00266 0.07028 0.43175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.229e+00 7.537e-01 -10.918 < 2e-16 ***
## Quality 9.222e-02 6.643e-03 13.882 < 2e-16 ***
## GroundSF.1 3.638e-01 2.396e-02 15.183 < 2e-16 ***
## YearBuilt 3.373e-03 3.138e-04 10.750 < 2e-16 ***
## LotArea.1 8.564e-02 1.191e-02 7.188 2.01e-12 ***
## Condition 5.421e-02 5.592e-03 9.696 < 2e-16 ***
## BasementFinSF 7.789e-05 1.429e-05 5.450 7.43e-08 ***
## BasementSF 1.172e-04 1.917e-05 6.112 1.79e-09 ***
## GarageSF 1.490e-04 3.230e-05 4.615 4.84e-06 ***
## Fireplaces.1 6.837e-02 1.686e-02 4.056 5.66e-05 ***
## YearRemodel 1.078e-03 3.631e-04 2.968 0.00312 **
## PorchSF.1 4.979e-02 1.784e-02 2.791 0.00542 **
## PorchYN -2.194e-01 9.038e-02 -2.427 0.01553 *
## OpenPorchSF.1 4.890e-03 2.943e-03 1.662 0.09712 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1309 on 586 degrees of freedom
## Multiple R-squared: 0.8986, Adjusted R-squared: 0.8964
## F-statistic: 399.5 on 13 and 586 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the stepwise-selection model.
summary(StepMod3)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 +
## Condition + BasementFinSF + BasementSF + GarageSF + Fireplaces.1 +
## YearRemodel + PorchSF.1 + PorchYN + OpenPorchSF.1, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54412 -0.06434 -0.00266 0.07028 0.43175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.229e+00 7.537e-01 -10.918 < 2e-16 ***
## Quality 9.222e-02 6.643e-03 13.882 < 2e-16 ***
## GroundSF.1 3.638e-01 2.396e-02 15.183 < 2e-16 ***
## YearBuilt 3.373e-03 3.138e-04 10.750 < 2e-16 ***
## LotArea.1 8.564e-02 1.191e-02 7.188 2.01e-12 ***
## Condition 5.421e-02 5.592e-03 9.696 < 2e-16 ***
## BasementFinSF 7.789e-05 1.429e-05 5.450 7.43e-08 ***
## BasementSF 1.172e-04 1.917e-05 6.112 1.79e-09 ***
## GarageSF 1.490e-04 3.230e-05 4.615 4.84e-06 ***
## Fireplaces.1 6.837e-02 1.686e-02 4.056 5.66e-05 ***
## YearRemodel 1.078e-03 3.631e-04 2.968 0.00312 **
## PorchSF.1 4.979e-02 1.784e-02 2.791 0.00542 **
## PorchYN -2.194e-01 9.038e-02 -2.427 0.01553 *
## OpenPorchSF.1 4.890e-03 2.943e-03 1.662 0.09712 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1309 on 586 degrees of freedom
## Multiple R-squared: 0.8986, Adjusted R-squared: 0.8964
## F-statistic: 399.5 on 13 and 586 DF, p-value: < 2.2e-16
# Diagnostic plots for the forward-selection model on log(Price).
plot(ForwardMod3)
The R-squared values are marginally better: before they were about 0.87, and now they are about 0.90.
# Forward and stepwise selection arrived at the same model, so only the
# backward and forward models are checked for multicollinearity.
vif(BackMod3)
## LotFrontage LotArea.1 Quality Condition
## 1.121446 1.536948 3.316212 1.481949
## YearBuilt YearRemodel BasementFinSF BasementUnFinSF.1
## 3.753977 2.150375 1.996687 2.021314
## BasementSF FirstSF.1 SecondSF GroundSF.1
## 4.991997 12.615737 13.719947 21.702070
## Bedroom TotalRooms.1 Fireplaces.1 GarageSF
## 2.456902 4.473049 1.556870 1.787843
## OpenPorchSF.1 EnclosedPorchSF.1 EnclosedPorchYN PorchYN
## 1.458297 44.903726 47.324415 2.605627
# Variance inflation factors for the forward-selection model.
vif(ForwardMod3)
## Quality GroundSF.1 YearBuilt LotArea.1 Condition
## 3.059337 2.212522 3.363803 1.347010 1.439036
## BasementFinSF BasementSF GarageSF Fireplaces.1 YearRemodel
## 1.357394 1.995782 1.703536 1.475296 2.034132
## PorchSF.1 PorchYN OpenPorchSF.1
## 54.125663 54.800962 1.422760
We have high vif values in both due to the inclusion of the indicator variables that are highly correlated with the original variables they were derived from. We’ll remove those and make the comparison again. We opt to remove the indicator variable rather than the original variable to remain faithful to the original data as much as possible.
# Refit the forward-selection model without the PorchYN indicator.
ForwardMod3a = lm(
  Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 + Condition +
    BasementFinSF + BasementSF + GarageSF + Fireplaces.1 + YearRemodel +
    PorchSF.1 + OpenPorchSF.1,
  data = MyAmesData4
)
# Refit the backward-selection model without the EnclosedPorchYN indicator
# (PorchYN is kept in this model).
BackwardsMod3a = lm(
  Price ~ LotFrontage + LotArea.1 + Quality + Condition + YearBuilt +
    YearRemodel + BasementFinSF + BasementUnFinSF.1 + BasementSF +
    FirstSF.1 + SecondSF + GroundSF.1 + Bedroom + TotalRooms.1 +
    Fireplaces.1 + GarageSF + OpenPorchSF.1 + EnclosedPorchSF.1 + PorchYN,
  data = MyAmesData4
)
summary(ForwardMod3a)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 +
## Condition + BasementFinSF + BasementSF + GarageSF + Fireplaces.1 +
## YearRemodel + PorchSF.1 + OpenPorchSF.1, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54498 -0.06373 -0.00087 0.07082 0.46924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.409e+00 7.532e-01 -11.164 < 2e-16 ***
## Quality 9.027e-02 6.621e-03 13.633 < 2e-16 ***
## GroundSF.1 3.672e-01 2.402e-02 15.289 < 2e-16 ***
## YearBuilt 3.475e-03 3.123e-04 11.126 < 2e-16 ***
## LotArea.1 8.634e-02 1.196e-02 7.219 1.63e-12 ***
## Condition 5.504e-02 5.604e-03 9.822 < 2e-16 ***
## BasementFinSF 7.948e-05 1.434e-05 5.544 4.47e-08 ***
## BasementSF 1.197e-04 1.923e-05 6.225 9.17e-10 ***
## GarageSF 1.518e-04 3.241e-05 4.683 3.51e-06 ***
## Fireplaces.1 6.975e-02 1.692e-02 4.123 4.28e-05 ***
## YearRemodel 1.052e-03 3.645e-04 2.886 0.00404 **
## PorchSF.1 7.031e-03 2.796e-03 2.514 0.01219 *
## OpenPorchSF.1 4.963e-03 2.955e-03 1.680 0.09358 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1315 on 587 degrees of freedom
## Multiple R-squared: 0.8976, Adjusted R-squared: 0.8955
## F-statistic: 428.8 on 12 and 587 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the refit backward model.
summary(BackwardsMod3a)
##
## Call:
## lm(formula = Price ~ LotFrontage + LotArea.1 + Quality + Condition +
## YearBuilt + YearRemodel + BasementFinSF + BasementUnFinSF.1 +
## BasementSF + FirstSF.1 + SecondSF + GroundSF.1 + Bedroom +
## TotalRooms.1 + Fireplaces.1 + GarageSF + OpenPorchSF.1 +
## EnclosedPorchSF.1 + PorchYN, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55036 -0.06030 -0.00128 0.06443 0.41341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.260e+00 8.020e-01 -10.299 < 2e-16 ***
## LotFrontage 2.771e-04 1.732e-04 1.599 0.110267
## LotArea.1 7.739e-02 1.266e-02 6.112 1.80e-09 ***
## Quality 8.747e-02 6.854e-03 12.762 < 2e-16 ***
## Condition 5.714e-02 5.646e-03 10.120 < 2e-16 ***
## YearBuilt 3.622e-03 3.292e-04 11.002 < 2e-16 ***
## YearRemodel 8.362e-04 3.715e-04 2.251 0.024761 *
## BasementFinSF 9.630e-05 1.724e-05 5.586 3.58e-08 ***
## BasementUnFinSF.1 7.254e-03 4.263e-03 1.702 0.089328 .
## BasementSF 8.197e-05 3.014e-05 2.720 0.006731 **
## FirstSF.1 2.039e-01 6.134e-02 3.325 0.000941 ***
## SecondSF 1.321e-04 4.518e-05 2.924 0.003593 **
## GroundSF.1 1.567e-01 7.452e-02 2.103 0.035908 *
## Bedroom -1.920e-02 1.022e-02 -1.878 0.060898 .
## TotalRooms.1 8.573e-02 5.472e-02 1.567 0.117727
## Fireplaces.1 6.660e-02 1.718e-02 3.876 0.000118 ***
## GarageSF 1.331e-04 3.290e-05 4.045 5.95e-05 ***
## OpenPorchSF.1 5.889e-03 2.964e-03 1.987 0.047412 *
## EnclosedPorchSF.1 8.668e-04 4.836e-03 0.179 0.857833
## PorchYN 2.515e-02 1.922e-02 1.309 0.191164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1303 on 580 degrees of freedom
## Multiple R-squared: 0.9006, Adjusted R-squared: 0.8974
## F-statistic: 276.7 on 19 and 580 DF, p-value: < 2.2e-16
# Variance inflation factors for the refit forward model.
vif(ForwardMod3a)
## Quality GroundSF.1 YearBuilt LotArea.1 Condition
## 3.014506 2.204888 3.304207 1.346221 1.433655
## BasementFinSF BasementSF GarageSF Fireplaces.1 YearRemodel
## 1.354543 1.990082 1.701441 1.473637 2.032403
## PorchSF.1 OpenPorchSF.1
## 1.318753 1.422611
# Variance inflation factors for the refit backward model.
vif(BackwardsMod3a)
## LotFrontage LotArea.1 Quality Condition
## 1.118537 1.536198 3.288753 1.481787
## YearBuilt YearRemodel BasementFinSF BasementUnFinSF.1
## 3.737942 2.149550 1.994185 2.020811
## BasementSF FirstSF.1 SecondSF GroundSF.1
## 4.978771 12.576704 13.694797 21.611117
## Bedroom TotalRooms.1 Fireplaces.1 GarageSF
## 2.455339 4.455751 1.547848 1.785758
## OpenPorchSF.1 EnclosedPorchSF.1 PorchYN
## 1.457359 2.785124 2.501627
Our adjusted R-squared values remain about the same with these changes, but the VIF values are much more reasonable. In the new BackwardsMod3a, EnclosedPorchSF.1 becomes very insignificant, with a p-value of 0.86. The remaining high VIF values in the backward selection model led us to choose the new forward selection model for this part. ForwardMod3a has a slightly lower adjusted R^2 value than BackwardsMod3a (0.8955 vs. 0.8974). There are fewer insignificant predictors in ForwardMod3a than in BackwardsMod3a. The VIF values for ForwardMod3a are much better than those for BackwardsMod3a. We could make adjustments to BackwardsMod3a to resolve the high VIFs and the insignificant p-values, but as we are content with the resulting ForwardMod3a, we will choose to adopt it rather than attempt to save a poor model.
Part 4
# Diagnostic plots for the chosen model.
plot(ForwardMod3a)
# Size of the largest residual in absolute value.
max(abs(ForwardMod3a$residuals))
## [1] 1.544982
# Which observation has that largest absolute residual.
which.max(abs(ForwardMod3a$residuals))
## 585
## 585
#max(abs(ForwardMod1$residuals))
# Standardized residual of observation 585.
rstandard(ForwardMod3a)[585]
## 585
## -12.02693
# Studentized residual of observation 585.
rstudent(ForwardMod3a)[585]
## 585
## -13.84264
# Studentized residuals against fitted values, with a horizontal line at 0.
plot(rstudent(ForwardMod3a) ~ ForwardMod3a$fitted.values, data = MyAmesData)
abline(h = 0)
# The same plot using standardized residuals.
plot(rstandard(ForwardMod3a) ~ ForwardMod3a$fitted.values, data = MyAmesData)
abline(h = 0)
# Linearity check: plot the (logged) Price against each predictor in the model.
plot(
  Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 + Condition +
    BasementFinSF + BasementSF + GarageSF + Fireplaces.1 + YearRemodel +
    PorchSF.1 + OpenPorchSF.1,
  data = MyAmesData4
)
# Values for observation 585, which is removed later.
MyAmesData[585, ]
After looking at the residuals vs. leverage plot, one residual falls just outside the 0.5 Cook’s distance boundary. That point has a difference of 1.8 between its standardized and studentized residuals, both of which exceed 10 in absolute value. Plotting log(price) against our predictors shows a fairly linear relationship across most predictors. In the cases where it isn’t linear, there are no better curves we could think of including. Linearity is satisfied.
The raw, standardized, and studentized residual plots all show that the errors have zero mean, because the residuals all appear to be centered around the reference line. The QQ plot appears to show a normal distribution since the bulk of the data falls along the qqline. The residuals vs. fitted plot shows independence and constant variance: there is no easily visible pattern to the residuals (the plot attempts to fit a curve, but it remains close to 0 and would not be visible without the curve superimposed, and it is certainly much better than in the original model), and there is no fan shape that would indicate non-constant variance.
Part 5
# Drop observation 585, the point with the largest residual.
FinalAmesData = MyAmesData4[-585, ]
FinalAmesData
# Refit the ForwardMod3a specification on the reduced data set.
FinalMod = update(ForwardMod3a, data = FinalAmesData)
# Summary of the model before the point was removed, for comparison.
summary(ForwardMod3a)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 +
## Condition + BasementFinSF + BasementSF + GarageSF + Fireplaces.1 +
## YearRemodel + PorchSF.1 + OpenPorchSF.1, data = MyAmesData4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54498 -0.06373 -0.00087 0.07082 0.46924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.409e+00 7.532e-01 -11.164 < 2e-16 ***
## Quality 9.027e-02 6.621e-03 13.633 < 2e-16 ***
## GroundSF.1 3.672e-01 2.402e-02 15.289 < 2e-16 ***
## YearBuilt 3.475e-03 3.123e-04 11.126 < 2e-16 ***
## LotArea.1 8.634e-02 1.196e-02 7.219 1.63e-12 ***
## Condition 5.504e-02 5.604e-03 9.822 < 2e-16 ***
## BasementFinSF 7.948e-05 1.434e-05 5.544 4.47e-08 ***
## BasementSF 1.197e-04 1.923e-05 6.225 9.17e-10 ***
## GarageSF 1.518e-04 3.241e-05 4.683 3.51e-06 ***
## Fireplaces.1 6.975e-02 1.692e-02 4.123 4.28e-05 ***
## YearRemodel 1.052e-03 3.645e-04 2.886 0.00404 **
## PorchSF.1 7.031e-03 2.796e-03 2.514 0.01219 *
## OpenPorchSF.1 4.963e-03 2.955e-03 1.680 0.09358 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1315 on 587 degrees of freedom
## Multiple R-squared: 0.8976, Adjusted R-squared: 0.8955
## F-statistic: 428.8 on 12 and 587 DF, p-value: < 2.2e-16
# Summary of the model refit without observation 585.
summary(FinalMod)
##
## Call:
## lm(formula = Price ~ Quality + GroundSF.1 + YearBuilt + LotArea.1 +
## Condition + BasementFinSF + BasementSF + GarageSF + Fireplaces.1 +
## YearRemodel + PorchSF.1 + OpenPorchSF.1, data = FinalAmesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68152 -0.06464 -0.00455 0.06490 0.43361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.666e+00 6.547e-01 -13.237 < 2e-16 ***
## Quality 8.105e-02 5.791e-03 13.994 < 2e-16 ***
## GroundSF.1 3.591e-01 2.088e-02 17.204 < 2e-16 ***
## YearBuilt 3.645e-03 2.716e-04 13.422 < 2e-16 ***
## LotArea.1 9.447e-02 1.041e-02 9.076 < 2e-16 ***
## Condition 5.395e-02 4.870e-03 11.079 < 2e-16 ***
## BasementFinSF 7.860e-05 1.246e-05 6.310 5.50e-10 ***
## BasementSF 1.052e-04 1.674e-05 6.286 6.38e-10 ***
## GarageSF 1.864e-04 2.827e-05 6.593 9.65e-11 ***
## Fireplaces.1 7.226e-02 1.470e-02 4.916 1.15e-06 ***
## YearRemodel 1.036e-03 3.167e-04 3.273 0.00113 **
## PorchSF.1 6.518e-03 2.430e-03 2.682 0.00751 **
## OpenPorchSF.1 4.772e-03 2.567e-03 1.859 0.06359 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1142 on 586 degrees of freedom
## Multiple R-squared: 0.9175, Adjusted R-squared: 0.9158
## F-statistic: 542.7 on 12 and 586 DF, p-value: < 2.2e-16
By removing this point, we saw an increase in adjusted R^2 from 0.8955 to 0.9158. Also, PorchSF.1 became more significant (** instead of *) in the final model. Overall, this is a good improvement and our model fits the data well.
# Characteristics of the house to predict. The ".1" predictors are entered
# as log(x + 1) of the raw value.
newx = data.frame(
  Quality = 7,
  GroundSF.1 = log(2314 + 1),
  YearBuilt = 1995,
  LotArea.1 = log(11060 + 1),
  Condition = 5,
  BasementFinSF = 0,
  BasementSF = 1150,
  GarageSF = 502,
  Fireplaces.1 = log(1 + 1),
  YearRemodel = 2003,
  PorchSF.1 = log(274 + 1),
  OpenPorchSF.1 = log(274 + 1)
)
# 95% prediction interval on the log(Price) scale.
predict(FinalMod, newx, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 5.510001 5.283118 5.736883
# Lower and upper bounds for a house with these characteristics: undo the log
# on the lwr/upr values printed by predict() and multiply by 1000.
exp(5.283118)*1000
## [1] 196983.1
exp(5.736883)*1000
## [1] 310096.3
We predict that a house with these characteristics will cost between 196983.1 and 310096.3 USD (95% prediction interval).